In [1]:
from gensim import corpora, models, similarities, utils
import numpy as np

In [2]:
# Load the previously saved token<->id dictionary and the bag-of-words corpus
# (Matrix Market format) from the working directory.
dictionary = corpora.Dictionary.load('brown.dict')
corpus = corpora.MmCorpus('brown.mm')
# Function-call form of print: a single parenthesised argument prints the same
# text under Python 2 and is also valid Python 3.
print(corpus)


MmCorpus(57340 documents, 26840 features, 981295 non-zero entries)

In [3]:
# First build a 200-dimensional LSI model from the bag-of-words corpus;
# id2word=dictionary is passed so the model can map feature ids back to words.
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=200)

Initialize Query Structures

先建立similarity matrix


In [4]:
# Index every corpus document in LSI space. The feature count is taken from
# the model itself so it cannot drift out of sync with the num_topics the
# model was built with.
index = similarities.MatrixSimilarity(lsi[corpus], num_features=lsi.num_topics)
# Persist the index, then reload it from disk; the reloaded object is the one
# used by the cells that follow.
index.save('brown.lsi.sim')
index = similarities.MatrixSimilarity.load('brown.lsi.sim')

接受使用者輸入,並轉換為query


In [5]:
# Example query text (hard-coded here).
query = "Kids likes to watch dog"
# Lower-case, split on whitespace, and convert to a bag-of-words vector using
# the dictionary.
qvec = dictionary.doc2bow(query.lower().split())
# Project the bag-of-words vector through the LSI model.
qlsi = lsi[qvec]

從similarity matrix中尋找與query最接近的vector


In [6]:
# Look the LSI-space query up in the similarity index; the displayed result is
# a float32 array of scores (presumably one per indexed document).
index[qlsi]


Out[6]:
array([ -2.23966595e-03,  -3.36699886e-05,   2.34704033e-01, ...,
         8.20740825e-04,   8.30179662e-04,   2.31815144e-01], dtype=float32)

In [7]:
def search_query(query):
    """Return the words of the indexed document most similar to ``query``.

    The query is lower-cased, split on whitespace, turned into a bag-of-words
    vector with the notebook-level ``dictionary``, projected through ``lsi``,
    and scored against ``index``. The document with the highest score is
    looked up in ``corpus`` and its token ids are mapped back to words.

    Args:
        query: Free-text query string.

    Returns:
        A list with the dictionary entry for each token id in the
        best-scoring document, or an empty list when the query produces an
        empty bag-of-words vector (e.g. an empty string, or no word of the
        query is present in the dictionary).
    """
    qvec = dictionary.doc2bow(query.lower().split())
    if not qvec:
        # Nothing to compare: without this guard an arbitrary document would
        # be picked from a meaningless score array (or the lookup would fail).
        return []
    qlsi = lsi[qvec]
    best_doc = np.argmax(index[qlsi])
    # `word_id` rather than `id`, which would shadow the builtin.
    return [dictionary[word_id] for word_id, _ in corpus[best_doc]]

In [8]:
# Run the search on the example query and display the matched document's words.
search_query(query)


Out[8]:
[u'to', u'monument']

In [ ]: